library(here)
library(mellonMisc)
library(haven)
library(readxl)
library(ggplot2)
library(survey)
library(reshape2)
library(haven)

setwd(here())
load(file = "data/intermediate/pf_results.rda")
load(file = "data/intermediate/comb_sd.rda")

weightCreate <- function(base) {
  base ^ (pf$Imp.est / comb.sd)
}


pf$weight.be <- prop.table(weightCreate(exp(1)))
pf$weight.b1.9 <- prop.table(weightCreate(1.9))
pf$weight.b2 <- prop.table(weightCreate(2))
pf$weight.b2.5 <- prop.table(weightCreate(2.5))
pf$linear.weight <- prop.table(pf$Imp.est - min(pf$Imp.est))

bes.mii <- read_stata("data/raw/besmiiw12w13.dta")
bes.mii <- bes.mii %>% filter(!is.na(bes.mii$wt_new_W12) | !is.na(bes.mii$wt_new_W13))
bes.mii$miiW13 <- as.character(as_factor(bes.mii$mii_catW13))
bes.mii$miiW13[bes.mii$miiW13 %in% c("Referendum unspecified", "uncoded", "election outcome", "economy-personal",
                                     "partisan-neg", "pol-neg")] <- NA

bes.d <- makeDesign(data = dtf(bes.mii), id.var = "id", weight.var = "wt_new_W13")
lb.media <- read_excel("data/raw/loughborough_media_results.xlsx", 1)

weightCor <- function(base, comparator, main = "Conjoint") {
  iss.areas <- dtf(Conjoint = prop.table(tapply(weightCreate(base), pf$agreed_cat_string, sum)), 
                   linear = prop.table(tapply(pf$linear.weight, pf$agreed_cat_string, sum)), 
                   Count = prop.table(table(pf$agreed_cat_string)))
  
  iss.areas$Issue <- iss.areas$Count.Var1
  iss.areas$Count.Var1 <- NULL
  colnames(iss.areas)[colnames(iss.areas)=="Count.Freq"] <- "Equal"
  
  iss.full <- iss.areas
  
  
  
  issues <- prop.table(svytable(design = bes.d, ~miiW13))
  
  iss.full$BES <- issues[match(iss.full$Issue, names(issues))]
  
  iss.full$BES[iss.full$Issue=='other'] <- 
    iss.full$BES[iss.full$Issue=='other'] + (1 -sum(iss.full$BES))
  
  iss.full$BES <- as.numeric(iss.full$BES)
  iss.full$Conjoint <- as.numeric(iss.full$Conjoint)
  iss.full$important <- iss.full$Conjoint>0.05 | iss.full$Equal>0.05 | iss.full$BES>0.05
  
  
  small.issues <- names(which(sort(rowSums(iss.areas[, 1:3])) < 0.03))
  small.issues  <- c(small.issues, "other")
  oth.tots <- colSums(iss.areas[iss.areas$Issue %in% small.issues, 1:3])
  
  oth.tots <- dtf(t(matrix(oth.tots)), "Other")
  
  colnames(oth.tots) <- c("Conjoint", "linear", "Equal", "Issue")
  
  iss.areas <- rbind(iss.areas[!iss.areas$Issue %in% small.issues, ], oth.tots)
  
  iss.order <- names(sort(rowSums(iss.areas[-nrow(iss.areas), 1:3]), decreasing = T))
  iss.order <- c(iss.order, "Other")
  
  iss.areas$Issue <- factor(iss.areas$Issue, levels = iss.order[length(iss.order):1])
  
  iss.areas <- melt(iss.areas)
  iss.areas$Weighting <- iss.areas$variable
  levels(iss.areas$Issue) <- Hmisc::capitalize(levels(iss.areas$Issue))
  
  lb.lookup <- read_excel("data/raw/cmpToBESMII.xlsx", "bes_lboro")
  
  iss.full$lboro <- lb.lookup$lborough[match(rownames(iss.full), lb.lookup$MII_cat1)]
  
  lboro.comp <- aggregate(iss.full[, c("Conjoint", "linear", "Equal", "BES")], 
                          list(Issue = iss.full$lboro), sum)
  lboro.comp$Media <- lb.media$`media reweighted`[match(lboro.comp$Issue, lb.media$Issue)]

  cor.mat.lboro <- cor(lboro.comp[-1])
  
  return(cor.mat.lboro[main, comparator])
}

test.vals <- seq(1, 5, 0.1)
base.tests.media <- sapply(test.vals, weightCor, comparator = "Media")
base.tests.mii <- sapply(test.vals, weightCor, comparator = "BES")
base.test <- dtf(base = test.vals, Media = base.tests.media, MII = base.tests.mii)

base.test <- melt(base.test, id.var = "base")

base.validation <- ggplot(base.test, aes(x = base, y = value, group = variable, colour = variable)) + 
  geom_line() + 
  geom_vline(xintercept = base.test$base[base.test$variable=="Media"][
    which.max(base.test$value[base.test$variable=="Media"])], 
    linetype = 2, 
    colour = bes_col(1)) + 
  geom_vline(xintercept = base.test$base[base.test$variable=="MII"][
    which.max(base.test$value[base.test$variable=="MII"])], 
    linetype = 2, colour = bes_col(2)[2]) + 
  scale_color_manual(values = bes_col()) + 
  ylab("Correlation between promises weights\nand validation data (by issue area)") + 
  theme_minimal() + geom_hline(yintercept = 0, size = 2) + 
  xlab("Base used for promise weighting exponentiation")


# plot(x = test.vals, y = base.tests)


#### Output ####

# Appendix G: 

# a base of 2.5 would reduce the proportion of the Conservatives’ manifesto 
# about Europe from 35% 
prop.table(tapply(pf$weight.be, pf$agreed_cat_string, sum))["europe"]
# from 30% 
prop.table(tapply(pf$weight.b2.5, pf$agreed_cat_string, sum))["europe"]
# and a base of 1.9 would reduce it to 18%
prop.table(tapply(pf$weight.b1.9, pf$agreed_cat_string, sum))["europe"]

#  Both of these are far closer to the weighted value we find than the 
# unweighted proportion of 5%. 
prop.table(table(pf$agreed_cat_string))["europe"]

# if we were to use linear scaling for the weights,  the correlation between 
# the BES MII scores and promise weights would only 0.41 
weightCor(base = NA, main = "linear", comparator = "BES")
# and 0.66 for media
weightCor(base = NA, main = "linear", comparator = "Media")

# Even using this approach, the importance of Europe would more than double
# (compared to equal weighting) to 11% of all promises. 
sort(tapply(pf$linear.weight, pf$agreed_cat_string, sum))

# Figure 11 Correlations between promises and Media/BES MII measure of
# issue importance with promise weights calculated using different bases for
# exponentiation. 
saveForPub(base.validation, file = "figures/base_validation", width = 6, height = 3.5)
